// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include <cstdint>

template <int64_t atom_w>
void conv_dw_f5s1_h2w4_kernel_riscv_fp32(
    const float *src,
    const float *flt,
    const float *bias,
    float *dst,

    int64_t src_pad_w,
    int64_t dst_h,
    int64_t dst_w)
{
    asm volatile(
        ".equ           ATOM_W, %c[ATOM_W]      \n\t"

        "addi           t0,     zero,   4       \n\t"
        "vsetvli        t1,     t0,     e32     \n\t"

        "addi           t1,     zero,   16      \n\t"
        "mul            t4,     %[DT_W],t1      \n\t" // dst_h_addr_stride = dst_w * 16

        "mv             t1,     %[SRC]          \n\t"
        "mv             t3,     %[DST]          \n\t"
        "mv             t5,     %[DT_H]         \n\t"

        "addi           s2,     zero,   2       \n\t"
        "addi           s3,     zero,   4       \n\t"
        // load bias    : v29
        "vle.v          v29,    (%[BIAS])       \n\t"

        "0:                                     \n\t"
        "mv             s4,     t1              \n\t"
        "mv             s8,     t3              \n\t"
        "blt            t5,     s2,     4f      \n\t"
        "blt            %[DT_W],s3,     2f      \n\t"
        "mv             s7,     %[DT_W]         \n\t"

        "1:                                     \n\t"
        "vmv.v.v        v0,     v29             \n\t"
        "vmv.v.v        v1,     v29             \n\t"
        "vmv.v.v        v2,     v29             \n\t"
        "vmv.v.v        v3,     v29             \n\t"
        "vmv.v.v        v4,     v29             \n\t"
        "vmv.v.v        v5,     v29             \n\t"
        "vmv.v.v        v6,     v29             \n\t"
        "vmv.v.v        v7,     v29             \n\t"
        // load filter  : v24-v28 (f00, f01, f02, f03, f04)
        "mv             t0,     %[FLT]          \n\t"
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line0 -- v8-v15
        //              : line1 -- v16-v23
        "mv             s10,    s4              \n\t"
        "vle.v          v8,     (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v9,     (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v10,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v11,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v12,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v13,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v14,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v15,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"

        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v16,    (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v17,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v18,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v19,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v20,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v21,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v22,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v23,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        // calculate
        // f00
        "vfmacc.vv      v0,     v24,    v8      \n\t"
        "vfmacc.vv      v1,     v24,    v9      \n\t"
        "vfmacc.vv      v2,     v24,    v10     \n\t"
        "vfmacc.vv      v3,     v24,    v11     \n\t"
        "vfmacc.vv      v4,     v24,    v16     \n\t"
        "vfmacc.vv      v5,     v24,    v17     \n\t"
        "vfmacc.vv      v6,     v24,    v18     \n\t"
        "vfmacc.vv      v7,     v24,    v19     \n\t"
        // f01
        "vfmacc.vv      v0,     v25,    v9      \n\t"
        "vfmacc.vv      v1,     v25,    v10     \n\t"
        "vfmacc.vv      v2,     v25,    v11     \n\t"
        "vfmacc.vv      v3,     v25,    v12     \n\t"
        "vfmacc.vv      v4,     v25,    v17     \n\t"
        "vfmacc.vv      v5,     v25,    v18     \n\t"
        "vfmacc.vv      v6,     v25,    v19     \n\t"
        "vfmacc.vv      v7,     v25,    v20     \n\t"
        //f02
        "vfmacc.vv      v0,     v26,    v10     \n\t"
        "vfmacc.vv      v1,     v26,    v11     \n\t"
        "vfmacc.vv      v2,     v26,    v12     \n\t"
        "vfmacc.vv      v3,     v26,    v13     \n\t"
        "vfmacc.vv      v4,     v26,    v18     \n\t"
        "vfmacc.vv      v5,     v26,    v19     \n\t"
        "vfmacc.vv      v6,     v26,    v20     \n\t"
        "vfmacc.vv      v7,     v26,    v21     \n\t"
        //f03
        "vfmacc.vv      v0,     v27,    v11     \n\t"
        "vfmacc.vv      v1,     v27,    v12     \n\t"
        "vfmacc.vv      v2,     v27,    v13     \n\t"
        "vfmacc.vv      v3,     v27,    v14     \n\t"
        "vfmacc.vv      v4,     v27,    v19     \n\t"
        "vfmacc.vv      v5,     v27,    v20     \n\t"
        "vfmacc.vv      v6,     v27,    v21     \n\t"
        "vfmacc.vv      v7,     v27,    v22     \n\t"
        //f04
        "vfmacc.vv      v0,     v28,    v12     \n\t"
        "vfmacc.vv      v1,     v28,    v13     \n\t"
        "vfmacc.vv      v2,     v28,    v14     \n\t"
        "vfmacc.vv      v3,     v28,    v15     \n\t"
        "vfmacc.vv      v4,     v28,    v20     \n\t"
        "vfmacc.vv      v5,     v28,    v21     \n\t"
        "vfmacc.vv      v6,     v28,    v22     \n\t"
        "vfmacc.vv      v7,     v28,    v23     \n\t"
        // load filter  : v24-v28 (f10, f11, f12, f13, f14)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line2 -- v8-v15
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v8,     (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v9,     (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v10,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v11,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v12,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v13,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v14,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v15,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        // calculate
        // f10
        "vfmacc.vv      v0,     v24,    v16     \n\t"
        "vfmacc.vv      v1,     v24,    v17     \n\t"
        "vfmacc.vv      v2,     v24,    v18     \n\t"
        "vfmacc.vv      v3,     v24,    v19     \n\t"
        "vfmacc.vv      v4,     v24,    v8      \n\t"
        "vfmacc.vv      v5,     v24,    v9      \n\t"
        "vfmacc.vv      v6,     v24,    v10     \n\t"
        "vfmacc.vv      v7,     v24,    v11     \n\t"
        //f11
        "vfmacc.vv      v0,     v25,    v17     \n\t"
        "vfmacc.vv      v1,     v25,    v18     \n\t"
        "vfmacc.vv      v2,     v25,    v19     \n\t"
        "vfmacc.vv      v3,     v25,    v20     \n\t"
        "vfmacc.vv      v4,     v25,    v9      \n\t"
        "vfmacc.vv      v5,     v25,    v10     \n\t"
        "vfmacc.vv      v6,     v25,    v11     \n\t"
        "vfmacc.vv      v7,     v25,    v12     \n\t"
        //f12
        "vfmacc.vv      v0,     v26,    v18     \n\t"
        "vfmacc.vv      v1,     v26,    v19     \n\t"
        "vfmacc.vv      v2,     v26,    v20     \n\t"
        "vfmacc.vv      v3,     v26,    v21     \n\t"
        "vfmacc.vv      v4,     v26,    v10     \n\t"
        "vfmacc.vv      v5,     v26,    v11     \n\t"
        "vfmacc.vv      v6,     v26,    v12     \n\t"
        "vfmacc.vv      v7,     v26,    v13     \n\t"
        //f13
        "vfmacc.vv      v0,     v27,    v19     \n\t"
        "vfmacc.vv      v1,     v27,    v20     \n\t"
        "vfmacc.vv      v2,     v27,    v21     \n\t"
        "vfmacc.vv      v3,     v27,    v22     \n\t"
        "vfmacc.vv      v4,     v27,    v11     \n\t"
        "vfmacc.vv      v5,     v27,    v12     \n\t"
        "vfmacc.vv      v6,     v27,    v13     \n\t"
        "vfmacc.vv      v7,     v27,    v14     \n\t"
        //f14
        "vfmacc.vv      v0,     v28,    v20     \n\t"
        "vfmacc.vv      v1,     v28,    v21     \n\t"
        "vfmacc.vv      v2,     v28,    v22     \n\t"
        "vfmacc.vv      v3,     v28,    v23     \n\t"
        "vfmacc.vv      v4,     v28,    v12     \n\t"
        "vfmacc.vv      v5,     v28,    v13     \n\t"
        "vfmacc.vv      v6,     v28,    v14     \n\t"
        "vfmacc.vv      v7,     v28,    v15     \n\t"
        // load filter  : v24-v28 (f20, f21, f22, f23, f24)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line3 -- v16-v23
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v16,    (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v17,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v18,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v19,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v20,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v21,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v22,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v23,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        // calculate
        //f20
        "vfmacc.vv      v0,     v24,    v8      \n\t"
        "vfmacc.vv      v1,     v24,    v9      \n\t"
        "vfmacc.vv      v2,     v24,    v10     \n\t"
        "vfmacc.vv      v3,     v24,    v11     \n\t"
        "vfmacc.vv      v4,     v24,    v16     \n\t"
        "vfmacc.vv      v5,     v24,    v17     \n\t"
        "vfmacc.vv      v6,     v24,    v18     \n\t"
        "vfmacc.vv      v7,     v24,    v19     \n\t"
        //f21
        "vfmacc.vv      v0,     v25,    v9      \n\t"
        "vfmacc.vv      v1,     v25,    v10     \n\t"
        "vfmacc.vv      v2,     v25,    v11     \n\t"
        "vfmacc.vv      v3,     v25,    v12     \n\t"
        "vfmacc.vv      v4,     v25,    v17     \n\t"
        "vfmacc.vv      v5,     v25,    v18     \n\t"
        "vfmacc.vv      v6,     v25,    v19     \n\t"
        "vfmacc.vv      v7,     v25,    v20     \n\t"
        //f22
        "vfmacc.vv      v0,     v26,    v10     \n\t"
        "vfmacc.vv      v1,     v26,    v11     \n\t"
        "vfmacc.vv      v2,     v26,    v12     \n\t"
        "vfmacc.vv      v3,     v26,    v13     \n\t"
        "vfmacc.vv      v4,     v26,    v18     \n\t"
        "vfmacc.vv      v5,     v26,    v19     \n\t"
        "vfmacc.vv      v6,     v26,    v20     \n\t"
        "vfmacc.vv      v7,     v26,    v21     \n\t"
        //f23
        "vfmacc.vv      v0,     v27,    v11     \n\t"
        "vfmacc.vv      v1,     v27,    v12     \n\t"
        "vfmacc.vv      v2,     v27,    v13     \n\t"
        "vfmacc.vv      v3,     v27,    v14     \n\t"
        "vfmacc.vv      v4,     v27,    v19     \n\t"
        "vfmacc.vv      v5,     v27,    v20     \n\t"
        "vfmacc.vv      v6,     v27,    v21     \n\t"
        "vfmacc.vv      v7,     v27,    v22     \n\t"
        //f24
        "vfmacc.vv      v0,     v28,    v12     \n\t"
        "vfmacc.vv      v1,     v28,    v13     \n\t"
        "vfmacc.vv      v2,     v28,    v14     \n\t"
        "vfmacc.vv      v3,     v28,    v15     \n\t"
        "vfmacc.vv      v4,     v28,    v20     \n\t"
        "vfmacc.vv      v5,     v28,    v21     \n\t"
        "vfmacc.vv      v6,     v28,    v22     \n\t"
        "vfmacc.vv      v7,     v28,    v23     \n\t"
        // load filter  : v24-v28 (f30, f31, f32, f33, f34)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line4 -- v8-v15
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v8,     (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v9,     (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v10,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v11,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v12,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v13,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v14,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v15,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        // calculate
        //f30
        "vfmacc.vv      v0,     v24,    v16     \n\t"
        "vfmacc.vv      v1,     v24,    v17     \n\t"
        "vfmacc.vv      v2,     v24,    v18     \n\t"
        "vfmacc.vv      v3,     v24,    v19     \n\t"
        "vfmacc.vv      v4,     v24,    v8      \n\t"
        "vfmacc.vv      v5,     v24,    v9      \n\t"
        "vfmacc.vv      v6,     v24,    v10     \n\t"
        "vfmacc.vv      v7,     v24,    v11     \n\t"
        //f31
        "vfmacc.vv      v0,     v25,    v17     \n\t"
        "vfmacc.vv      v1,     v25,    v18     \n\t"
        "vfmacc.vv      v2,     v25,    v19     \n\t"
        "vfmacc.vv      v3,     v25,    v20     \n\t"
        "vfmacc.vv      v4,     v25,    v9      \n\t"
        "vfmacc.vv      v5,     v25,    v10     \n\t"
        "vfmacc.vv      v6,     v25,    v11     \n\t"
        "vfmacc.vv      v7,     v25,    v12     \n\t"
        //f32
        "vfmacc.vv      v0,     v26,    v18     \n\t"
        "vfmacc.vv      v1,     v26,    v19     \n\t"
        "vfmacc.vv      v2,     v26,    v20     \n\t"
        "vfmacc.vv      v3,     v26,    v21     \n\t"
        "vfmacc.vv      v4,     v26,    v10     \n\t"
        "vfmacc.vv      v5,     v26,    v11     \n\t"
        "vfmacc.vv      v6,     v26,    v12     \n\t"
        "vfmacc.vv      v7,     v26,    v13     \n\t"
        //f33
        "vfmacc.vv      v0,     v27,    v19     \n\t"
        "vfmacc.vv      v1,     v27,    v20     \n\t"
        "vfmacc.vv      v2,     v27,    v21     \n\t"
        "vfmacc.vv      v3,     v27,    v22     \n\t"
        "vfmacc.vv      v4,     v27,    v11     \n\t"
        "vfmacc.vv      v5,     v27,    v12     \n\t"
        "vfmacc.vv      v6,     v27,    v13     \n\t"
        "vfmacc.vv      v7,     v27,    v14     \n\t"
        //f34
        "vfmacc.vv      v0,     v28,    v20     \n\t"
        "vfmacc.vv      v1,     v28,    v21     \n\t"
        "vfmacc.vv      v2,     v28,    v22     \n\t"
        "vfmacc.vv      v3,     v28,    v23     \n\t"
        "vfmacc.vv      v4,     v28,    v12     \n\t"
        "vfmacc.vv      v5,     v28,    v13     \n\t"
        "vfmacc.vv      v6,     v28,    v14     \n\t"
        "vfmacc.vv      v7,     v28,    v15     \n\t"
        // load filter  : v24-v28 (f40, f41, f42, f43, f44)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line5 -- v16-v23
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v16,    (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v17,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v18,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v19,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v20,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v21,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v22,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v23,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        // calculate
        //f40
        "vfmacc.vv      v0,     v24,    v8      \n\t"
        "vfmacc.vv      v1,     v24,    v9      \n\t"
        "vfmacc.vv      v2,     v24,    v10     \n\t"
        "vfmacc.vv      v3,     v24,    v11     \n\t"
        "vfmacc.vv      v4,     v24,    v16     \n\t"
        "vfmacc.vv      v5,     v24,    v17     \n\t"
        "vfmacc.vv      v6,     v24,    v18     \n\t"
        "vfmacc.vv      v7,     v24,    v19     \n\t"
        //f41
        "vfmacc.vv      v0,     v25,    v9      \n\t"
        "vfmacc.vv      v1,     v25,    v10     \n\t"
        "vfmacc.vv      v2,     v25,    v11     \n\t"
        "vfmacc.vv      v3,     v25,    v12     \n\t"
        "vfmacc.vv      v4,     v25,    v17     \n\t"
        "vfmacc.vv      v5,     v25,    v18     \n\t"
        "vfmacc.vv      v6,     v25,    v19     \n\t"
        "vfmacc.vv      v7,     v25,    v20     \n\t"
        //f42
        "vfmacc.vv      v0,     v26,    v10     \n\t"
        "vfmacc.vv      v1,     v26,    v11     \n\t"
        "vfmacc.vv      v2,     v26,    v12     \n\t"
        "vfmacc.vv      v3,     v26,    v13     \n\t"
        "vfmacc.vv      v4,     v26,    v18     \n\t"
        "vfmacc.vv      v5,     v26,    v19     \n\t"
        "vfmacc.vv      v6,     v26,    v20     \n\t"
        "vfmacc.vv      v7,     v26,    v21     \n\t"
        //f43
        "vfmacc.vv      v0,     v27,    v11     \n\t"
        "vfmacc.vv      v1,     v27,    v12     \n\t"
        "vfmacc.vv      v2,     v27,    v13     \n\t"
        "vfmacc.vv      v3,     v27,    v14     \n\t"
        "vfmacc.vv      v4,     v27,    v19     \n\t"
        "vfmacc.vv      v5,     v27,    v20     \n\t"
        "vfmacc.vv      v6,     v27,    v21     \n\t"
        "vfmacc.vv      v7,     v27,    v22     \n\t"
        //f44
        "vfmacc.vv      v0,     v28,    v12     \n\t"
        "vfmacc.vv      v1,     v28,    v13     \n\t"
        "vfmacc.vv      v2,     v28,    v14     \n\t"
        "vfmacc.vv      v3,     v28,    v15     \n\t"
        "vfmacc.vv      v4,     v28,    v20     \n\t"
        "vfmacc.vv      v5,     v28,    v21     \n\t"
        "vfmacc.vv      v6,     v28,    v22     \n\t"
        "vfmacc.vv      v7,     v28,    v23     \n\t"
        // store dst    : v0-v7
        "mv             s11,    s8              \n\t"
        "vse.v          v0,     (s11)           \n\t"
        "addi           s9,     s11,    16      \n\t"
        "vse.v          v1,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"
        "vse.v          v2,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"
        "vse.v          v3,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"

        "add            s11,    s11,    t4      \n\t"
        "vse.v          v4,     (s11)           \n\t"
        "addi           s9,     s11,    16      \n\t"
        "vse.v          v5,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"
        "vse.v          v6,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"
        "vse.v          v7,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"

        // loop control
        // loop_w
        "addi           s7,     s7,     -4      \n\t"
        "addi           s4,     s4,     64      \n\t" // src_addr_stride : 4 * 8 * 2
        "addi           s8,     s8,     64      \n\t" // dst_addr_stride : 4 * 8 * 2
        "bge            s7,     s3,     1b      \n\t"
        "beq            s7,     zero,   3f      \n\t"

        "2:                                     \n\t"
        "vmv.v.v        v0,     v29             \n\t"
        "vmv.v.v        v4,     v29             \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vmv.v.v        v1,     v29             \n\t"
        "vmv.v.v        v5,     v29             \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vmv.v.v        v2,     v29             \n\t"
        "vmv.v.v        v6,     v29             \n\t"
        ".endif                                 \n\t"
        // load filter  : v24-v28 (f00, f01, f02, f03, f04)
        "mv             t0,     %[FLT]          \n\t"
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line0 -- v8, v9, v10, v11, v12, xx, xx, xx
        //              : line1 -- v16, v17, v18, v19, v20, xx, xx, xx
        "mv             s10,    s4              \n\t"
        "vle.v          v8,     (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v9,     (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v10,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v11,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v12,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vle.v          v13,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vle.v          v14,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"

        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v16,    (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v17,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v18,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v19,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v20,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vle.v          v21,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vle.v          v22,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        // calculate
        // f00
        "vfmacc.vv      v0,     v24,    v8      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v24,    v9      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v24,    v10     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v24,    v16     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v24,    v17     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v24,    v18     \n\t"
        ".endif                                 \n\t"
        // f01
        "vfmacc.vv      v0,     v25,    v9      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v25,    v10     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v25,    v11     \n\t"
        ".endif                                 \n\t"

        "vfmacc.vv      v4,     v25,    v17     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v25,    v18     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v25,    v19     \n\t"
        ".endif                                 \n\t"
        //f02
        "vfmacc.vv      v0,     v26,    v10     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v26,    v11     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v26,    v12     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v26,    v18     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v26,    v19     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v26,    v20     \n\t"
        ".endif                                 \n\t"
        //f03
        "vfmacc.vv      v0,     v27,    v11     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v27,    v12     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v27,    v13     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v27,    v19     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v27,    v20     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v27,    v21     \n\t"
        ".endif                                 \n\t"
        //f04
        "vfmacc.vv      v0,     v28,    v12     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v28,    v13     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v28,    v14     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v28,    v20     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v28,    v21     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v28,    v22     \n\t"
        ".endif                                 \n\t"
        // load filter  : v24-v28 (f10, f11, f12, f13, f14)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line2 -- v8, v9, v10, v11, v12, xx, xx, xx
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v8,     (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v9,     (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v10,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v11,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v12,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vle.v          v13,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vle.v          v14,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        // calculate
        // f10
        "vfmacc.vv      v0,     v24,    v16     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v24,    v17     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v24,    v18     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v24,    v8      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v24,    v9      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v24,    v10     \n\t"
        ".endif                                 \n\t"
        //f11
        "vfmacc.vv      v0,     v25,    v17     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v25,    v18     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v25,    v19     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v25,    v9      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v25,    v10     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v25,    v11     \n\t"
        ".endif                                 \n\t"
        //f12
        "vfmacc.vv      v0,     v26,    v18     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v26,    v19     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v26,    v20     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v26,    v10     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v26,    v11     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v26,    v12     \n\t"
        ".endif                                 \n\t"
        //f13
        "vfmacc.vv      v0,     v27,    v19     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v27,    v20     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v27,    v21     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v27,    v11     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v27,    v12     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v27,    v13     \n\t"
        ".endif                                 \n\t"
        //f14
        "vfmacc.vv      v0,     v28,    v20     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v28,    v21     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v28,    v22     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v28,    v12     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v28,    v13     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v28,    v14     \n\t"
        ".endif                                 \n\t"
        // load filter  : v24-v28 (f20, f21, f22, f23, f24)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line3 -- v16, v17, v18, v19, v20, xx, xx, xx
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v16,    (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v17,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v18,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v19,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v20,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vle.v          v21,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vle.v          v22,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        // calculate
        //f20
        "vfmacc.vv      v0,     v24,    v8      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v24,    v9      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v24,    v10     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v24,    v16     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v24,    v17     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v24,    v18     \n\t"
        ".endif                                 \n\t"
        //f21
        "vfmacc.vv      v0,     v25,    v9      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v25,    v10     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v25,    v11     \n\t"
        ".endif                                 \n\t"

        "vfmacc.vv      v4,     v25,    v17     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v25,    v18     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v25,    v19     \n\t"
        ".endif                                 \n\t"
        //f22
        "vfmacc.vv      v0,     v26,    v10     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v26,    v11     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v26,    v12     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v26,    v18     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v26,    v19     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v26,    v20     \n\t"
        ".endif                                 \n\t"
        //f23
        "vfmacc.vv      v0,     v27,    v11     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v27,    v12     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v27,    v13     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v27,    v19     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v27,    v20     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v27,    v21     \n\t"
        ".endif                                 \n\t"
        //f24
        "vfmacc.vv      v0,     v28,    v12     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v28,    v13     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v28,    v14     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v28,    v20     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v28,    v21     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v28,    v22     \n\t"
        ".endif                                 \n\t"
        // load filter  : v24-v28 (f30, f31, f32, f33, f34)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line4 -- v8, v9, v10, v11, v12, xx, xx, xx
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v8,     (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v9,     (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v10,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v11,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v12,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vle.v          v13,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vle.v          v14,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        // calculate
        //f30
        "vfmacc.vv      v0,     v24,    v16     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v24,    v17     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v24,    v18     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v24,    v8      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v24,    v9      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v24,    v10     \n\t"
        ".endif                                 \n\t"
        //f31
        "vfmacc.vv      v0,     v25,    v17     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v25,    v18     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v25,    v19     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v25,    v9      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v25,    v10     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v25,    v11     \n\t"
        ".endif                                 \n\t"
        //f32
        "vfmacc.vv      v0,     v26,    v18     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v26,    v19     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v26,    v20     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v26,    v10     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v26,    v11     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v26,    v12     \n\t"
        ".endif                                 \n\t"
        //f33
        "vfmacc.vv      v0,     v27,    v19     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v27,    v20     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v27,    v21     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v27,    v11     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v27,    v12     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v27,    v13     \n\t"
        ".endif                                 \n\t"
        //f34
        "vfmacc.vv      v0,     v28,    v20     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v28,    v21     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v28,    v22     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v28,    v12     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v28,    v13     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v28,    v14     \n\t"
        ".endif                                 \n\t"
        // load filter  : v24-v28 (f40, f41, f42, f43, f44)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line5 -- v16, v17, v18, v19, v20, xx, xx, xx
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v16,    (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v17,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v18,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v19,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v20,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vle.v          v21,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vle.v          v22,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        // calculate
        //f40
        "vfmacc.vv      v0,     v24,    v8      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v24,    v9      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v24,    v10     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v24,    v16     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v24,    v17     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v24,    v18     \n\t"
        ".endif                                 \n\t"
        //f41
        "vfmacc.vv      v0,     v25,    v9      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v25,    v10     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v25,    v11     \n\t"
        ".endif                                 \n\t"

        "vfmacc.vv      v4,     v25,    v17     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v25,    v18     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v25,    v19     \n\t"
        ".endif                                 \n\t"
        //f42
        "vfmacc.vv      v0,     v26,    v10     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v26,    v11     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v26,    v12     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v26,    v18     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v26,    v19     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v26,    v20     \n\t"
        ".endif                                 \n\t"
        //f43
        "vfmacc.vv      v0,     v27,    v11     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v27,    v12     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v27,    v13     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v27,    v19     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v27,    v20     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v27,    v21     \n\t"
        ".endif                                 \n\t"
        //f44
        "vfmacc.vv      v0,     v28,    v12     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v28,    v13     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v28,    v14     \n\t"
        ".endif                                 \n\t"
        "vfmacc.vv      v4,     v28,    v20     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v5,     v28,    v21     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v6,     v28,    v22     \n\t"
        ".endif                                 \n\t"
        // store dst    : v0, (v1), (v2) at s8; v4, (v5), (v6) at s8 + t4 -- (vN) only when ATOM_W > 1 / > 2
        "mv             s11,    s8              \n\t"
        "vse.v          v0,     (s11)           \n\t"
        "addi           s9,     s11,    16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vse.v          v1,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vse.v          v2,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"
        ".endif                                 \n\t"

        "add            s11,    s11,    t4      \n\t"
        "vse.v          v4,     (s11)           \n\t"
        "addi           s9,     s11,    16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vse.v          v5,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vse.v          v6,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"
        ".endif                                 \n\t"

        // loop control
        // loop_h
        "3:                                     \n\t"
        "addi           t5,     t5,     -2      \n\t"
        "add            t1,     t1,     %[H_STD]\n\t"
        "add            t1,     t1,     %[H_STD]\n\t"
        "add            t3,     t3,     t4      \n\t"
        "add            t3,     t3,     t4      \n\t"
        "bge            t5,     s2,     0b      \n\t"
        "beq            t5,     zero,   7f      \n\t"

        "4:                                     \n\t"
        "mv             s4,     t1              \n\t"
        "mv             s8,     t3              \n\t"
        "blt            %[DT_W],s3,     6f      \n\t"
        "mv             s7,     %[DT_W]         \n\t"

        "5:                                     \n\t"
        "vmv.v.v        v0,     v29             \n\t"
        "vmv.v.v        v1,     v29             \n\t"
        "vmv.v.v        v2,     v29             \n\t"
        "vmv.v.v        v3,     v29             \n\t"
        // load filter  : v24-v28 (f00, f01, f02, f03, f04)
        "mv             t0,     %[FLT]          \n\t"
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line0 -- v8-v15
        "mv             s10,    s4              \n\t"
        "vle.v          v8,     (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v9,     (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v10,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v11,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v12,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v13,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v14,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v15,    (s5)            \n\t"
        // calculate
        // f00
        "vfmacc.vv      v0,     v24,    v8      \n\t"
        "vfmacc.vv      v1,     v24,    v9      \n\t"
        "vfmacc.vv      v2,     v24,    v10     \n\t"
        "vfmacc.vv      v3,     v24,    v11     \n\t"
        // f01
        "vfmacc.vv      v0,     v25,    v9      \n\t"
        "vfmacc.vv      v1,     v25,    v10     \n\t"
        "vfmacc.vv      v2,     v25,    v11     \n\t"
        "vfmacc.vv      v3,     v25,    v12     \n\t"
        //f02
        "vfmacc.vv      v0,     v26,    v10     \n\t"
        "vfmacc.vv      v1,     v26,    v11     \n\t"
        "vfmacc.vv      v2,     v26,    v12     \n\t"
        "vfmacc.vv      v3,     v26,    v13     \n\t"
        //f03
        "vfmacc.vv      v0,     v27,    v11     \n\t"
        "vfmacc.vv      v1,     v27,    v12     \n\t"
        "vfmacc.vv      v2,     v27,    v13     \n\t"
        "vfmacc.vv      v3,     v27,    v14     \n\t"
        //f04
        "vfmacc.vv      v0,     v28,    v12     \n\t"
        "vfmacc.vv      v1,     v28,    v13     \n\t"
        "vfmacc.vv      v2,     v28,    v14     \n\t"
        "vfmacc.vv      v3,     v28,    v15     \n\t"
        // load filter  : v24-v28 (f10, f11, f12, f13, f14)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line1 -- v16-v23
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v16,    (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v17,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v18,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v19,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v20,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v21,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v22,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v23,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        // calculate
        // f10
        "vfmacc.vv      v0,     v24,    v16     \n\t"
        "vfmacc.vv      v1,     v24,    v17     \n\t"
        "vfmacc.vv      v2,     v24,    v18     \n\t"
        "vfmacc.vv      v3,     v24,    v19     \n\t"
        //f11
        "vfmacc.vv      v0,     v25,    v17     \n\t"
        "vfmacc.vv      v1,     v25,    v18     \n\t"
        "vfmacc.vv      v2,     v25,    v19     \n\t"
        "vfmacc.vv      v3,     v25,    v20     \n\t"
        //f12
        "vfmacc.vv      v0,     v26,    v18     \n\t"
        "vfmacc.vv      v1,     v26,    v19     \n\t"
        "vfmacc.vv      v2,     v26,    v20     \n\t"
        "vfmacc.vv      v3,     v26,    v21     \n\t"
        //f13
        "vfmacc.vv      v0,     v27,    v19     \n\t"
        "vfmacc.vv      v1,     v27,    v20     \n\t"
        "vfmacc.vv      v2,     v27,    v21     \n\t"
        "vfmacc.vv      v3,     v27,    v22     \n\t"
        //f14
        "vfmacc.vv      v0,     v28,    v20     \n\t"
        "vfmacc.vv      v1,     v28,    v21     \n\t"
        "vfmacc.vv      v2,     v28,    v22     \n\t"
        "vfmacc.vv      v3,     v28,    v23     \n\t"
        // load filter  : v24-v28 (f20, f21, f22, f23, f24)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line2 -- v8-v15
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v8,     (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v9,     (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v10,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v11,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v12,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v13,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v14,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v15,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        // calculate
        //f20
        "vfmacc.vv      v0,     v24,    v8      \n\t"
        "vfmacc.vv      v1,     v24,    v9      \n\t"
        "vfmacc.vv      v2,     v24,    v10     \n\t"
        "vfmacc.vv      v3,     v24,    v11     \n\t"
        //f21
        "vfmacc.vv      v0,     v25,    v9      \n\t"
        "vfmacc.vv      v1,     v25,    v10     \n\t"
        "vfmacc.vv      v2,     v25,    v11     \n\t"
        "vfmacc.vv      v3,     v25,    v12     \n\t"
        //f22
        "vfmacc.vv      v0,     v26,    v10     \n\t"
        "vfmacc.vv      v1,     v26,    v11     \n\t"
        "vfmacc.vv      v2,     v26,    v12     \n\t"
        "vfmacc.vv      v3,     v26,    v13     \n\t"
        //f23
        "vfmacc.vv      v0,     v27,    v11     \n\t"
        "vfmacc.vv      v1,     v27,    v12     \n\t"
        "vfmacc.vv      v2,     v27,    v13     \n\t"
        "vfmacc.vv      v3,     v27,    v14     \n\t"
        //f24
        "vfmacc.vv      v0,     v28,    v12     \n\t"
        "vfmacc.vv      v1,     v28,    v13     \n\t"
        "vfmacc.vv      v2,     v28,    v14     \n\t"
        "vfmacc.vv      v3,     v28,    v15     \n\t"
        // load filter  : v24-v28 (f30, f31, f32, f33, f34)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line3 -- v16-v23
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v16,    (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v17,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v18,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v19,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v20,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v21,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v22,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v23,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        // calculate
        //f30
        "vfmacc.vv      v0,     v24,    v16     \n\t"
        "vfmacc.vv      v1,     v24,    v17     \n\t"
        "vfmacc.vv      v2,     v24,    v18     \n\t"
        "vfmacc.vv      v3,     v24,    v19     \n\t"
        //f31
        "vfmacc.vv      v0,     v25,    v17     \n\t"
        "vfmacc.vv      v1,     v25,    v18     \n\t"
        "vfmacc.vv      v2,     v25,    v19     \n\t"
        "vfmacc.vv      v3,     v25,    v20     \n\t"
        //f32
        "vfmacc.vv      v0,     v26,    v18     \n\t"
        "vfmacc.vv      v1,     v26,    v19     \n\t"
        "vfmacc.vv      v2,     v26,    v20     \n\t"
        "vfmacc.vv      v3,     v26,    v21     \n\t"
        //f33
        "vfmacc.vv      v0,     v27,    v19     \n\t"
        "vfmacc.vv      v1,     v27,    v20     \n\t"
        "vfmacc.vv      v2,     v27,    v21     \n\t"
        "vfmacc.vv      v3,     v27,    v22     \n\t"
        //f34
        "vfmacc.vv      v0,     v28,    v20     \n\t"
        "vfmacc.vv      v1,     v28,    v21     \n\t"
        "vfmacc.vv      v2,     v28,    v22     \n\t"
        "vfmacc.vv      v3,     v28,    v23     \n\t"
        // load filter  : v24-v28 (f40, f41, f42, f43, f44)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line4 -- v8-v15
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v8,     (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v9,     (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v10,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v11,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v12,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v13,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v14,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v15,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        // calculate
        //f40
        "vfmacc.vv      v0,     v24,    v8      \n\t"
        "vfmacc.vv      v1,     v24,    v9      \n\t"
        "vfmacc.vv      v2,     v24,    v10     \n\t"
        "vfmacc.vv      v3,     v24,    v11     \n\t"
        //f41
        "vfmacc.vv      v0,     v25,    v9      \n\t"
        "vfmacc.vv      v1,     v25,    v10     \n\t"
        "vfmacc.vv      v2,     v25,    v11     \n\t"
        "vfmacc.vv      v3,     v25,    v12     \n\t"
        //f42
        "vfmacc.vv      v0,     v26,    v10     \n\t"
        "vfmacc.vv      v1,     v26,    v11     \n\t"
        "vfmacc.vv      v2,     v26,    v12     \n\t"
        "vfmacc.vv      v3,     v26,    v13     \n\t"
        //f43
        "vfmacc.vv      v0,     v27,    v11     \n\t"
        "vfmacc.vv      v1,     v27,    v12     \n\t"
        "vfmacc.vv      v2,     v27,    v13     \n\t"
        "vfmacc.vv      v3,     v27,    v14     \n\t"
        //f44
        "vfmacc.vv      v0,     v28,    v12     \n\t"
        "vfmacc.vv      v1,     v28,    v13     \n\t"
        "vfmacc.vv      v2,     v28,    v14     \n\t"
        "vfmacc.vv      v3,     v28,    v15     \n\t"
        // store dst    : v0-v3
        "mv             s11,    s8              \n\t"
        "vse.v          v0,     (s11)           \n\t"
        "addi           s9,     s11,    16      \n\t"
        "vse.v          v1,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"
        "vse.v          v2,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"
        "vse.v          v3,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"
        // loop_w
        "addi           s7,     s7,     -4      \n\t"
        "addi           s4,     s4,     64      \n\t"
        "addi           s8,     s8,     64      \n\t"
        "bge            s7,     s3,     5b      \n\t"
        "beq            s7,     zero,   7f      \n\t"

        "6:                                     \n\t"
        "vmv.v.v        v0,     v29             \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vmv.v.v        v1,     v29             \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vmv.v.v        v2,     v29             \n\t"
        ".endif                                 \n\t"
        // load filter  : v24-v28 (f00, f01, f02, f03, f04)
        "mv             t0,     %[FLT]          \n\t"
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line0 -- v8, v9, v10, v11, v12, xx, xx, xx
        "mv             s10,    s4              \n\t"
        "vle.v          v8,     (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v9,     (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v10,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v11,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v12,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vle.v          v13,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vle.v          v14,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        // calculate
        // f00
        "vfmacc.vv      v0,     v24,    v8      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v24,    v9      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v24,    v10     \n\t"
        ".endif                                 \n\t"
        // f01
        "vfmacc.vv      v0,     v25,    v9      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v25,    v10     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v25,    v11     \n\t"
        ".endif                                 \n\t"
        //f02
        "vfmacc.vv      v0,     v26,    v10     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v26,    v11     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v26,    v12     \n\t"
        ".endif                                 \n\t"
        //f03
        "vfmacc.vv      v0,     v27,    v11     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v27,    v12     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v27,    v13     \n\t"
        ".endif                                 \n\t"
        //f04
        "vfmacc.vv      v0,     v28,    v12     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v28,    v13     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v28,    v14     \n\t"
        ".endif                                 \n\t"
        // load filter  : v24-v28 (f10, f11, f12, f13, f14)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line1 -- v16, v17, v18, v19, v20, xx, xx, xx
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v16,    (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v17,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v18,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v19,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v20,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vle.v          v21,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vle.v          v22,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        // calculate
        // f10
        "vfmacc.vv      v0,     v24,    v16     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v24,    v17     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v24,    v18     \n\t"
        ".endif                                 \n\t"
        //f11
        "vfmacc.vv      v0,     v25,    v17     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v25,    v18     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v25,    v19     \n\t"
        ".endif                                 \n\t"
        //f12
        "vfmacc.vv      v0,     v26,    v18     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v26,    v19     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v26,    v20     \n\t"
        ".endif                                 \n\t"
        //f13
        "vfmacc.vv      v0,     v27,    v19     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v27,    v20     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v27,    v21     \n\t"
        ".endif                                 \n\t"
        //f14
        "vfmacc.vv      v0,     v28,    v20     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v28,    v21     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v28,    v22     \n\t"
        ".endif                                 \n\t"
        // load filter  : v24-v28 (f20, f21, f22, f23, f24)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line2 -- v8, v9, v10, v11, v12, xx, xx, xx
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v8,     (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v9,     (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v10,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v11,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v12,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vle.v          v13,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vle.v          v14,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        // calculate
        //f20
        "vfmacc.vv      v0,     v24,    v8      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v24,    v9      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v24,    v10     \n\t"
        ".endif                                 \n\t"
        //f21
        "vfmacc.vv      v0,     v25,    v9      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v25,    v10     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v25,    v11     \n\t"
        ".endif                                 \n\t"
        //f22
        "vfmacc.vv      v0,     v26,    v10     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v26,    v11     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v26,    v12     \n\t"
        ".endif                                 \n\t"
        //f23
        "vfmacc.vv      v0,     v27,    v11     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v27,    v12     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v27,    v13     \n\t"
        ".endif                                 \n\t"
        //f24
        "vfmacc.vv      v0,     v28,    v12     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v28,    v13     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v28,    v14     \n\t"
        ".endif                                 \n\t"
        // load filter  : v24-v28 (f30, f31, f32, f33, f34)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line3 -- v16, v17, v18, v19, v20, xx, xx, xx
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v16,    (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v17,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v18,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v19,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v20,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vle.v          v21,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vle.v          v22,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        // calculate
        //f30
        "vfmacc.vv      v0,     v24,    v16     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v24,    v17     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v24,    v18     \n\t"
        ".endif                                 \n\t"
        //f31
        "vfmacc.vv      v0,     v25,    v17     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v25,    v18     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v25,    v19     \n\t"
        ".endif                                 \n\t"
        //f32
        "vfmacc.vv      v0,     v26,    v18     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v26,    v19     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v26,    v20     \n\t"
        ".endif                                 \n\t"
        //f33
        "vfmacc.vv      v0,     v27,    v19     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v27,    v20     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v27,    v21     \n\t"
        ".endif                                 \n\t"
        //f34
        "vfmacc.vv      v0,     v28,    v20     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v28,    v21     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v28,    v22     \n\t"
        ".endif                                 \n\t"
        // load filter  : v24-v28 (f40, f41, f42, f43, f44)
        "vle.v          v24,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v25,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v26,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v27,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        "vle.v          v28,    (t0)            \n\t"
        "addi           t0,     t0,     16      \n\t"
        // load src     : line4 -- v8, v9, v10, v11, v12, xx, xx, xx
        "add            s10,    s10,    %[H_STD]\n\t"
        "vle.v          v8,     (s10)           \n\t"
        "addi           s5,     s10,    16      \n\t"
        "vle.v          v9,     (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v10,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v11,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        "vle.v          v12,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vle.v          v13,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vle.v          v14,    (s5)            \n\t"
        "addi           s5,     s5,     16      \n\t"
        ".endif                                 \n\t"
        // calculate
        //f40
        "vfmacc.vv      v0,     v24,    v8      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v24,    v9      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v24,    v10     \n\t"
        ".endif                                 \n\t"
        //f41
        "vfmacc.vv      v0,     v25,    v9      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v25,    v10     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v25,    v11     \n\t"
        ".endif                                 \n\t"
        //f42
        "vfmacc.vv      v0,     v26,    v10     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v26,    v11     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v26,    v12     \n\t"
        ".endif                                 \n\t"
        //f43
        "vfmacc.vv      v0,     v27,    v11     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v27,    v12     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v27,    v13     \n\t"
        ".endif                                 \n\t"
        //f44
        "vfmacc.vv      v0,     v28,    v12     \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vfmacc.vv      v1,     v28,    v13     \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vfmacc.vv      v2,     v28,    v14     \n\t"
        ".endif                                 \n\t"
        // store dst    : v0-v2
        "mv             s11,    s8              \n\t"
        "vse.v          v0,     (s11)           \n\t"
        "addi           s9,     s11,    16      \n\t"
        ".if ATOM_W > 1                         \n\t"
        "vse.v          v1,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"
        ".endif                                 \n\t"
        ".if ATOM_W > 2                         \n\t"
        "vse.v          v2,     (s9)            \n\t"
        "addi           s9,     s9,     16      \n\t"
        ".endif                                 \n\t"

        "7:                                     \n\t"
        "nop                                    \n\t"
        :
        : [ATOM_W] "i"(atom_w), [SRC] "r"(src), [FLT] "r"(flt), [DST] "r"(dst), [BIAS] "r"(bias), [H_STD] "r"(src_pad_w * 4 * 4), [DT_H] "r"(dst_h), [DT_W] "r"(dst_w)
        : "memory", "t0", "t1", "t2", "t3", "t4", "t5", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29");
}
